Analysis by LEA Sums¶

In this notebook, about 7,800 districts are analysed. The data includes the total student enrollment in various programs and high school level courses. Districts with low enrollment are removed from the analysis. The analysis consists of principal component analysis (PCA), K-Means clustering, linear discriminant analysis (LDA), correlation, covariance, and multiple regression predicting poverty rate based on enrollment.

In [1]:
query="""
SELECT 
	rls.leaid
	,min(rls.lea_name) AS lea_name
	,min(rls.lea_state) as lea_state
	,sum(GREATEST(advmath.tot_mathenr_advm_m,0) + GREATEST(advmath.tot_mathenr_advm_f,0)) AS advmath_enr
	,sum(GREATEST(advpl.TOT_APEXAM_NONE_M,0) + GREATEST(advpl.TOT_APEXAM_NONE_F,0)) AS advpl_noexam
	,sum(GREATEST(alg1.TOT_ALGPASS_GS0910_M,0) + GREATEST(alg1.TOT_ALGPASS_GS0910_F,0)) AS alg1_0910_passed
	,sum(GREATEST(alg1.TOT_ALGPASS_GS1112_M,0) + GREATEST(alg1.TOT_ALGPASS_GS1112_F,0)) AS alg1_1112_passed
	,sum(GREATEST(alg2.tot_mathenr_alg2_m,0) + GREATEST(alg2.tot_mathenr_alg2_f,0)) AS alg2_enr
	,sum(GREATEST(bio.TOT_SCIENR_BIOL_M,0) + GREATEST(bio.TOT_SCIENR_BIOL_F,0)) AS bio_enr
	,sum(GREATEST(calc.TOT_MATHENR_CALC_M,0) + GREATEST(calc.TOT_MATHENR_CALC_F,0)) AS calc_enr
	,sum(GREATEST(chem.TOT_SCIENR_CHEM_M,0) + GREATEST(chem.TOT_SCIENR_CHEM_F,0)) AS chem_enr
	,sum(GREATEST(dual.TOT_DUAL_M,0) + GREATEST(dual.TOT_DUAL_F,0)) AS dual_enr
	,sum(GREATEST(enr.tot_enr_m,0) + GREATEST(enr.tot_enr_f,0)) AS total_enr
	,sum(GREATEST(enr.SCH_ENR_LEP_M,0) + GREATEST(enr.SCH_ENR_LEP_F,0)) AS enr_lep
	,sum(GREATEST(enr.SCH_ENR_504_M,0) + GREATEST(enr.SCH_ENR_504_F,0)) AS enr_504
	,sum(GREATEST(enr.SCH_ENR_IDEA_M,0) + GREATEST(enr.SCH_ENR_IDEA_F,0)) AS enr_idea
	,sum(GREATEST(geo.TOT_MATHENR_GEOM_M,0) + GREATEST(geo.TOT_MATHENR_GEOM_F,0)) AS geo_enr
	,sum(GREATEST(phys.TOT_SCIENR_PHYS_M,0) + GREATEST(phys.TOT_SCIENR_PHYS_F,0)) AS phys_enr
	,sum(GREATEST(satact.TOT_SATACT_M,0) + GREATEST(satact.TOT_SATACT_F,0)) AS satact
	,avg(saipe.totalpopulation) AS totalpopulation 
	,avg(saipe.population5_17) AS population5_17
	,avg(saipe.population5_17inpoverty) AS population5_17inpoverty
FROM ref_schema.ref_lea_sch rls
JOIN data_schema.sch_advancedmathematics advmath ON advmath.combokey = rls.combokey
JOIN data_schema.sch_advancedplacement advpl ON advpl.combokey = rls.combokey
JOIN data_schema.sch_algebrai alg1 ON alg1.combokey = rls.combokey
JOIN data_schema.sch_algebraii alg2 ON alg2.combokey = rls.combokey 
JOIN data_schema.sch_biology bio ON bio.combokey = rls.combokey 
JOIN data_schema.sch_calculus calc ON calc.combokey = rls.combokey 
JOIN data_schema.sch_chemistry chem ON chem.combokey = rls.combokey 
JOIN data_schema.sch_dualenrollment dual ON dual.combokey = rls.combokey 
JOIN data_schema.sch_enrollment enr ON enr.combokey = rls.combokey 
JOIN data_schema.sch_geometry geo ON geo.combokey = rls.combokey 
JOIN data_schema.sch_physics phys ON phys.combokey = rls.combokey 
JOIN data_schema.sch_satandact satact ON satact.combokey = rls.combokey 
JOIN data_schema.sch_schoolcharacteristics chr ON chr.combokey = rls.combokey 
JOIN data_schema.saipe_ussd17 saipe ON saipe.leaid = rls.leaid
WHERE chr.hs_only = TRUE
group by rls.leaid
order by leaid;
"""
In [2]:
from sqlalchemy import create_engine
db_params = {
    "database": "postgres",
    "user": "postgres",
    "password": "pwd123",
    "host": "postgres-db",
    "port": "5432"
}
connection_string = f"postgresql://{db_params['user']}:{db_params['password']}@{db_params['host']}:{db_params['port']}/{db_params['database']}"
engine = create_engine(connection_string)
In [3]:
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import plotly.io as pio
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from kneed import KneeLocator
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
In [4]:
# df = pd.read_csv('LEA_agg_data_sums.csv')
In [5]:
df = pd.read_sql(query, engine)
In [6]:
df.columns
Out[6]:
Index(['leaid', 'lea_name', 'lea_state', 'advmath_enr', 'advpl_noexam',
       'alg1_0910_passed', 'alg1_1112_passed', 'alg2_enr', 'bio_enr',
       'calc_enr', 'chem_enr', 'dual_enr', 'total_enr', 'enr_lep', 'enr_504',
       'enr_idea', 'geo_enr', 'phys_enr', 'satact', 'totalpopulation',
       'population5_17', 'population5_17inpoverty'],
      dtype='object')
In [7]:
exclude_cols = ['leaid', 'lea_name', 'lea_state', 
                'totalpopulation', 'population5_17',
                'population5_17inpoverty', 'total_enr']
columns_to_modify = df.columns.difference(exclude_cols)
df[columns_to_modify] = df[columns_to_modify].clip(lower=0)
In [8]:
enrollment_sum = df['total_enr']
columns_to_modify = df.columns.difference(exclude_cols)
df[columns_to_modify] = df[columns_to_modify].div(enrollment_sum, axis=0).fillna(0)
In [9]:
df[enrollment_sum <= 10][['total_enr','leaid',
                         'lea_state','totalpopulation']]
Out[9]:
total_enr leaid lea_state totalpopulation
765 6 0804500 CO 1853.0
1491 6 1726880 IL 159563.0
2625 0 2509360 MA 54172.0
3164 8 2730870 MN 4559.0
3699 8 3027930 MT 195.0
3701 5 3028170 MT 359.0
4710 7 3812930 ND 1230.0
5889 2 4217130 PA 4671.0
6840 7 4831470 TX 2079.0
7058 5 4844710 TX 1986.0
7331 7 5305040 WA 515.0
In [10]:
df = df[enrollment_sum > 10]
df = df.reset_index(drop=True)
In [11]:
df['5_17_poverty_percent'] = df['population5_17inpoverty']/df['population5_17']
In [12]:
df.columns.difference(exclude_cols)
Out[12]:
Index(['5_17_poverty_percent', 'advmath_enr', 'advpl_noexam',
       'alg1_0910_passed', 'alg1_1112_passed', 'alg2_enr', 'bio_enr',
       'calc_enr', 'chem_enr', 'dual_enr', 'enr_504', 'enr_idea', 'enr_lep',
       'geo_enr', 'phys_enr', 'satact'],
      dtype='object')
In [13]:
df.head()
Out[13]:
leaid lea_name lea_state advmath_enr advpl_noexam alg1_0910_passed alg1_1112_passed alg2_enr bio_enr calc_enr ... enr_lep enr_504 enr_idea geo_enr phys_enr satact totalpopulation population5_17 population5_17inpoverty 5_17_poverty_percent
0 0100005 Albertville City AL 0.222912 0.003451 0.292616 0.002761 0.148378 0.365079 0.034507 ... 0.118703 0.010352 0.052450 0.227053 0.194617 0.250518 21786.0 4115.0 1546.0 0.375699
1 0100006 Marshall County AL 0.116970 0.000000 0.208054 0.000959 0.292426 0.518696 0.033557 ... 0.044104 0.015340 0.114094 0.371045 0.000000 0.234899 48481.0 8762.0 2495.0 0.284752
2 0100007 Hoover City AL 0.345115 0.026345 0.009221 0.000439 0.053568 0.306915 0.078375 ... 0.131504 0.041493 0.102964 0.260154 0.030296 0.403074 82783.0 14679.0 1038.0 0.070713
3 0100008 Madison City AL 0.221141 0.004866 0.188970 0.000811 0.016491 0.124899 0.077589 ... 0.020816 0.019194 0.069208 0.137875 0.062449 0.273587 46797.0 9683.0 735.0 0.075906
4 0100011 Leeds City AL 0.143443 0.002049 0.196721 0.000000 0.229508 0.272541 0.038934 ... 0.116803 0.030738 0.141393 0.258197 0.000000 0.198770 11900.0 1742.0 302.0 0.173364

5 rows × 23 columns

PCA¶

In [14]:
ids = df['leaid'].values
lea_names = df['lea_name'].values
states = df['lea_state'].values
pop5_17 = df['population5_17']
pov5_17 = df['5_17_poverty_percent']
In [15]:
ids = df['leaid'].values

# Step 1: Subset the DataFrame
subset_df = df[df.columns.difference(exclude_cols)]
for_pca_use = df[df['total_enr'] > 15][df.columns.difference(exclude_cols)]

# Step 2: Standardize the data
scaler = StandardScaler()
standardized_data = scaler.fit_transform(subset_df)
pca_data = scaler.fit_transform(for_pca_use)

# Step 3: Compute covariance matrix, eigenvectors, and eigenvalues for PCA
cov_matrix = np.cov(pca_data, rowvar=False)
eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)

# Sort eigenvectors by eigenvalue size (descending order)
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvectors = eigenvectors[:, sorted_indices]
eigenvalues = eigenvalues[sorted_indices]

# Step 4: Project data onto the top 3 principal components
projected_data = np.dot(pca_data, eigenvectors[:, :3])

# Step 5: Create an interactive 3D plot using Plotly
trace = go.Scatter3d(
    x=projected_data[:, 0],
    y=projected_data[:, 1],
    z=projected_data[:, 2],
    mode='markers',
    marker=dict(size=5, color='blue', opacity=0.5),
    text=[f"LEA ID: {i}, {state}<br>LEA Name: {lea}<br>5_17 Pop: {int(pop)}<br>5_17 Pov: {100*pov:.2f}%" 
          for i, lea, state, pop, pov in zip(ids, lea_names, states, pop5_17, pov5_17)],  
    # Display ID, School Name, and LEA Name when hovering
    hoverinfo="text+x+y+z"
)

PC1_range = [projected_data[:, 0].min(),projected_data[:, 0].max()]
PC2_range = [projected_data[:, 1].min(),projected_data[:, 1].max()]
PC3_range = [projected_data[:, 2].min(),projected_data[:, 2].max()]
for i in range(1,4):
    exec(f"extension = 0.1*(PC{i}_range[1] - PC{i}_range[0])")
    exec(f"PC{i}_range[0] -= extension")
    exec(f"PC{i}_range[1] += extension")

layout = go.Layout(
    title="Data Projected on Top 3 Principal Components",
    scene=dict(
        xaxis=dict(
            title="Principal Component 1",
            range=[projected_data[:, 0].min(), projected_data[:, 0].max()]  
        ),
        yaxis=dict(
            title="Principal Component 2"
        ),
        zaxis=dict(
            title="Principal Component 3"
        )
    )
)

fig = go.Figure(data=[trace], layout=layout)

pio.show(fig)
In [16]:
extreme_PC1 = df.iloc[np.argsort(np.abs(projected_data[:, 0]))[-3:]]
extreme_PC1.T
Out[16]:
1288 2368 4117
leaid 1704440 2200810 3500030
lea_name Astoria CUSD 1 Jefferson Davis Parish ALAMOGORDO PUBLIC SCHOOLS
lea_state IL LA NM
advmath_enr 0.077586 0.187861 0.191298
advpl_noexam 0.0 0.0 0.009669
alg1_0910_passed 0.12069 0.176301 0.174033
alg1_1112_passed 0.0 0.011561 0.004834
alg2_enr 0.112069 0.15896 0.260359
bio_enr 0.362069 0.124277 0.370166
calc_enr 0.043103 0.0 0.014503
chem_enr 0.068966 0.193642 0.133978
dual_enr 0.025862 0.115607 0.058011
total_enr 116 346 1448
enr_lep 0.0 0.0 0.017956
enr_504 0.034483 0.026012 0.020028
enr_idea 0.146552 0.083815 0.120856
geo_enr 0.077586 0.179191 0.283149
phys_enr 0.103448 0.0 0.020028
satact 0.267241 0.260116 0.0
totalpopulation 1988.0 31477.0 44672.0
population5_17 321.0 5875.0 6725.0
population5_17inpoverty 70.0 1342.0 1713.0
5_17_poverty_percent 0.218069 0.228426 0.254721
In [17]:
pc1 = eigenvectors[:, 0]
pc2 = eigenvectors[:, 1]
In [18]:
df.columns.difference(exclude_cols)
print(f"{'Column Name'.ljust(20)}: PC1 Weight")
for i in range(len(pc1)):
    col_name = df.columns.difference(exclude_cols)[i]
    print(f"{col_name.ljust(20)}: {100*pc1[i]:.2f}%")
Column Name         : PC1 Weight
5_17_poverty_percent: 28.36%
advmath_enr         : -35.97%
advpl_noexam        : -16.81%
alg1_0910_passed    : 2.61%
alg1_1112_passed    : 3.48%
alg2_enr            : -28.03%
bio_enr             : -25.92%
calc_enr            : -32.60%
chem_enr            : -42.54%
dual_enr            : -4.66%
enr_504             : -23.12%
enr_idea            : 13.35%
enr_lep             : 7.09%
geo_enr             : -22.62%
phys_enr            : -38.45%
satact              : -22.26%
In [19]:
print(f"{'Column Name'.ljust(20)}: PC2 Weight")
for i in range(len(pc2)):
    col_name = df.columns.difference(exclude_cols)[i]
    print(f"{col_name.ljust(20)}: {100*pc2[i]:.2f}%")
Column Name         : PC2 Weight
5_17_poverty_percent: 36.17%
advmath_enr         : -10.91%
advpl_noexam        : -9.36%
alg1_0910_passed    : 45.12%
alg1_1112_passed    : 25.39%
alg2_enr            : 36.92%
bio_enr             : 33.55%
calc_enr            : -23.87%
chem_enr            : 7.18%
dual_enr            : 2.63%
enr_504             : -20.18%
enr_idea            : 5.47%
enr_lep             : 15.23%
geo_enr             : 44.89%
phys_enr            : 0.33%
satact              : -1.92%
In [20]:
inertia = []
k_range = range(1, 11)

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(standardized_data)
    inertia.append(kmeans.inertia_)

# Plot the elbow curve
plt.figure(figsize=(8, 6))
plt.plot(k_range, inertia, 'bo-')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal k')
plt.show()
No description has been provided for this image
In [21]:
knee = KneeLocator(k_range, inertia, curve="convex", direction="decreasing")

# Elbow point
optimal_k = knee.elbow

print(f"The optimal number of clusters (k) is: {optimal_k}")
The optimal number of clusters (k) is: 4
In [22]:
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(standardized_data)
In [23]:
enr_cols = []
unique_clusters = np.unique(df['cluster'])
print(f"{'Cluster'.ljust(10)}: LEAs in Dataset")
for cluster in unique_clusters:
    count = np.sum(df['cluster'] == cluster)
    print(f"{str(cluster).ljust(10)}: {count}")
Cluster   : LEAs in Dataset
0         : 1759
1         : 2989
2         : 61
3         : 3006
In [24]:
def lda(X, y):
    mean = X.mean(axis=0)
    class_labels = np.unique(y)
    m, x_m, n = [[],[],[]]
    for cl in class_labels:
        data = X[y == cl]
        m.append(data.mean(axis=0))
        x_m.append(data - m[-1])
        n.append(len(data))
    Sw = sum((xm.T @ xm) for xm in x_m)
    Sb = sum((np.outer(d,d)*n_i) for d, n_i in zip(m-mean,n))
    eigval,eigvec=np.linalg.eig(np.linalg.inv(Sw)@Sb)
    idx = np.argsort(eigval)[::-1]
    return eigval[idx],np.real(eigvec[:,idx])
In [25]:
X = standardized_data
y = df['cluster']
eigval,eigvec = lda(X, y)
X_lda = X@eigvec

# Ensure that X_lda has at least 3 components for 3D plotting
if X_lda.shape[1] < 3:
    # Pad with zeros if fewer than 3 components
    X_lda = np.pad(X_lda, ((0, 0), (0, 3 - X_lda.shape[1])), mode='constant')

# Create an interactive 3D plot using Plotly
trace = go.Scatter3d(
    x=X_lda[:, 0],
    y=X_lda[:, 1],
    z=X_lda[:, 2],
    mode='markers',
    marker=dict(size=5, color=y, opacity=0.8),
    text=[f"LEA ID: {i}, {state}<br>LEA Name: {lea}<br>5_17 Pop: {int(pop)}<br>5_17 Pov: {100*pov:.2f}%" 
          for i, lea, state, pop, pov in zip(ids, lea_names, states, pop5_17, pov5_17)],  
    # Display ID, School Name, and LEA Name when hovering
    hoverinfo="text+x+y+z"
)



layout = go.Layout(
    title="LDA Projection on Top 3 Discriminant Components",
    scene=dict(
        xaxis_title="LDA Component 1",
        yaxis_title="LDA Component 2",
        zaxis_title="LDA Component 3"
    )
)

fig = go.Figure(data=[trace], layout=layout)

pio.show(fig)
In [26]:
extreme_LDA = df.iloc[np.argsort(np.abs(X_lda[:, 0]))[-3:]]
extreme_LDA.T
Out[26]:
186 3038 1220
leaid 0406960 2636600 1602520
lea_name San Carlos Unified District Yale Public Schools OROFINO JOINT DISTRICT
lea_state AZ MI ID
advmath_enr 0.0 0.0 0.0
advpl_noexam 0.0 0.044343 0.0
alg1_0910_passed 0.137255 0.463303 0.151261
alg1_1112_passed 0.372549 0.474006 0.815126
alg2_enr 0.392157 0.0 0.252101
bio_enr 0.490196 0.0 0.521008
calc_enr 0.0 0.0 0.0
chem_enr 0.0 0.0 0.0
dual_enr 0.0 0.0 0.0
total_enr 51 654 119
enr_lep 0.0 0.003058 0.0
enr_504 0.0 0.027523 0.0
enr_idea 0.0 0.091743 0.12605
geo_enr 0.313725 0.0 0.0
phys_enr 0.0 0.0 0.0
satact 0.0 0.0 0.0
totalpopulation 5292.0 11026.0 8799.0
population5_17 1262.0 2032.0 1092.0
population5_17inpoverty 586.0 252.0 210.0
5_17_poverty_percent 0.464342 0.124016 0.192308
cluster 2 2 2
In [27]:
eig1, eig2, eig3 =(eigvec.T)[:3] # column = eigvec
exclude_cols.append('cluster')
In [28]:
print(f"{'Column Name'.ljust(20)}: PC1 Weight")
for i in range(len(eig1)):
    col_name = df.columns.difference(exclude_cols)[i]
    print(f"{col_name.ljust(20)}: {100*eig1[i]:.2f}%")
Column Name         : PC1 Weight
5_17_poverty_percent: -33.37%
advmath_enr         : 35.60%
advpl_noexam        : 18.70%
alg1_0910_passed    : -8.33%
alg1_1112_passed    : -42.47%
alg2_enr            : 9.28%
bio_enr             : 8.19%
calc_enr            : 23.88%
chem_enr            : 29.95%
dual_enr            : -2.70%
enr_504             : 25.77%
enr_idea            : -11.12%
enr_lep             : -4.13%
geo_enr             : 10.75%
phys_enr            : 51.92%
satact              : 12.40%
In [29]:
print(f"{'Column Name'.ljust(20)}: PC1 Weight")
for i in range(len(eig2)):
    col_name = df.columns.difference(exclude_cols)[i]
    print(f"{col_name.ljust(20)}: {100*eig2[i]:.2f}%")
Column Name         : PC1 Weight
5_17_poverty_percent: -4.79%
advmath_enr         : 9.79%
advpl_noexam        : 3.58%
alg1_0910_passed    : 1.56%
alg1_1112_passed    : 96.99%
alg2_enr            : 10.55%
bio_enr             : 3.33%
calc_enr            : 7.26%
chem_enr            : 6.97%
dual_enr            : -1.30%
enr_504             : 5.24%
enr_idea            : -3.52%
enr_lep             : -4.30%
geo_enr             : 8.44%
phys_enr            : 8.95%
satact              : 4.87%

Covariance¶

In [30]:
standardized_df = pd.DataFrame(standardized_data)
standardized_df.columns = df.columns.difference(exclude_cols)
correlation_matrix = standardized_df.cov()
In [31]:
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=False, fmt=".2f", cmap="bwr", cbar=True)
plt.title('Correlation Matrix Heatmap')
plt.show()
No description has been provided for this image
In [32]:
covariance_matrix = df[df.columns.difference(exclude_cols)].cov()
plt.figure(figsize=(12, 8))
sns.heatmap(covariance_matrix, annot=False, fmt=".2f", cmap="bwr", cbar=True)
plt.title('Covariance Matrix Heatmap')
plt.show()
No description has been provided for this image

Multiple Regression¶

In [33]:
dependent_var = '5_17_poverty_percent'
independent_vars = df.columns.difference(exclude_cols + [dependent_var])
In [34]:
high_p_vals = ['alg1_1112_passed','dual_enr','enr_idea']
independent_vars = independent_vars.difference(high_p_vals)
independent_vars
Out[34]:
Index(['advmath_enr', 'advpl_noexam', 'alg1_0910_passed', 'alg2_enr',
       'bio_enr', 'calc_enr', 'chem_enr', 'enr_504', 'enr_lep', 'geo_enr',
       'phys_enr', 'satact'],
      dtype='object')
In [35]:
X = df[independent_vars]
X = sm.add_constant(X)
Y = df[dependent_var]
model = sm.OLS(Y, X).fit()
model.summary()
Out[35]:
OLS Regression Results
Dep. Variable: 5_17_poverty_percent R-squared: 0.217
Model: OLS Adj. R-squared: 0.216
Method: Least Squares F-statistic: 180.5
Date: Fri, 30 Aug 2024 Prob (F-statistic): 0.00
Time: 03:29:47 Log-Likelihood: 8433.3
No. Observations: 7815 AIC: -1.684e+04
Df Residuals: 7802 BIC: -1.675e+04
Df Model: 12
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const 0.1917 0.004 44.061 0.000 0.183 0.200
advmath_enr -0.1115 0.010 -11.004 0.000 -0.131 -0.092
advpl_noexam -0.0917 0.016 -5.657 0.000 -0.123 -0.060
alg1_0910_passed 0.0390 0.013 3.095 0.002 0.014 0.064
alg2_enr 0.0266 0.013 2.008 0.045 0.001 0.053
bio_enr 0.0437 0.009 4.608 0.000 0.025 0.062
calc_enr -0.5978 0.021 -28.252 0.000 -0.639 -0.556
chem_enr -0.0387 0.012 -3.332 0.001 -0.061 -0.016
enr_504 -0.2883 0.028 -10.443 0.000 -0.342 -0.234
enr_lep 0.2303 0.015 15.175 0.000 0.201 0.260
geo_enr 0.0605 0.014 4.479 0.000 0.034 0.087
phys_enr -0.0177 0.010 -1.810 0.070 -0.037 0.001
satact -0.0459 0.008 -5.682 0.000 -0.062 -0.030
Omnibus: 1219.248 Durbin-Watson: 1.498
Prob(Omnibus): 0.000 Jarque-Bera (JB): 2156.345
Skew: 1.008 Prob(JB): 0.00
Kurtosis: 4.598 Cond. No. 34.4


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [36]:
vif_data = pd.DataFrame()
vif_data["Variable"] = X.columns
vif_data["VIF"] = [vif(X.values, i) for i in range(X.shape[1])]
vif_data
Out[36]:
Variable VIF
0 const 21.837002
1 advmath_enr 1.197135
2 advpl_noexam 1.045260
3 alg1_0910_passed 1.088459
4 alg2_enr 1.218922
5 bio_enr 1.162986
6 calc_enr 1.133672
7 chem_enr 1.352108
8 enr_504 1.091597
9 enr_lep 1.025601
10 geo_enr 1.204435
11 phys_enr 1.247579
12 satact 1.083754